Pyclone-VI fitting

Building Pyclone input

In order to rerun the clustering assignments with pyclone-vi we first format the data to create the right input files.

format_to_pyclone <-  function(xx,purity){

  library(dplyr)
  
  
  # Format data for pyclone, we are using everything as it was clonal
  mutation_id <- paste0("chr",xx$Chromosome, ":", xx$Start_position, ":", xx$Start_position, ":", round(xx$ccf,3), ":", xx$gene, ":", xx$isdriver)
  ref_counts <-  xx$t_ref_count
  alt_counts <- xx$t_alt_count
  sample_id <- xx$Tumor_Sample_Barcode
  major_cn <- xx$major_cn
  minor_cn <-  xx$minor_cn
  normal_cn <-  2
  tumour_content <- purity
  gene_symb <- xx$gene

  df <-  data.frame(
    mutation_id = mutation_id,
    ref_counts = ref_counts,
    alt_counts = alt_counts,
    sample_id = sample_id,
    major_cn = as.integer(major_cn),
    minor_cn = as.integer(minor_cn),
    normal_cn = as.integer(normal_cn),
    tumour_content = tumour_content,
    gene_symb = gene_symb
  )
  
  # Before returning the dataframe we filter NAs in CNV and NV/NR values
  return(df %>%  filter(!is.na(major_cn), !is.na(ref_counts), !is.na(alt_counts)) %>% unique() %>%
           filter(!duplicated(mutation_id)) %>%  unique())

}

Loading input data

Here we assume you have the data as described in the PCAWG data vignette, the input here is a list of data.frame for each sample in the object snvs_list. For the metadata and the pyclone-vi installation the code is presented in the Germline overdispersion analysis vignette.

library(tidyverse)

# this file is produced at the end of the first vignete
maf_ccf <-  readr::read_csv("../processed_data/maf_subclone_ccf.csv.gz")

# you can download this file by running a chunk in Germline overdispersion analysis vignette
meta <-  readr::read_tsv("../germline_overdispersion_analysis/meta.tar.gz") %>% split(., .$samplename)

snvs_list <- maf_ccf %>% split(., .$Tumor_Sample_Barcode)

meta <-  meta[names(snvs_list)]

names <- names(snvs_list)


meta <- lapply(meta, function(x) x$purity)

py_clone_VI <-  mapply(snvs_list[1:32], meta,FUN = function(x,y) format_to_pyclone(x, y), SIMPLIFY = F)

mapply(py_clone_VI, names(py_clone_VI)[1:32], FUN = function(x,y) {

  dir.create(y, showWarnings = F)
  write.table(x, file = paste0("./", y, "/pyclone_input.tsv"), sep = "\t", row.names = F, quote = F)

})

Running the actual analysis

We then fit the pyclone-vi model and plot the data. The pyclone running script is written in the file pyclone_refits/pyclone.sh

run_pyclone <- function(x){
  
  # run the script in each directory and go back to the original wd
  setwd(x)
  if(!file.exists("pyclone_output.tsv"))
    tryCatch( system("bash ../pyclone.sh ")
      ,finally = setwd("..")
    ) 
  else 
    setwd("..")
}

plot_pyclone <- function(x){
  library(patchwork)
  library(tidyverse)
  # rplot ccf histogram in each directory and go back to the original wd
  setwd(x)
    
    inp <-  readr::read_tsv("pyclone_output.tsv") %>% separate(col = mutation_id,
                                                               into = c("chr", "from", "to", "ccf", "gene", "is_driver"),
                                                               sep = ":")
    
    # Clusters cellular prevalence plot
    p1 <-  ggplot(inp, aes(x = cellular_prevalence, fill = cluster_id %>% paste)) +
      geom_histogram() + theme_bw() + scale_fill_brewer("Cluster", palette = "Set1") + 
      ggtitle( paste0("Pyclone clustering ", inp$sample_id %>% unique()) )
    
    # As pyclone just returns the CCF per cluster we are using the PCAWG estimated CCF per mutation
    p2 <- ggplot(inp, aes(x = ccf %>% as.numeric, fill = cluster_id %>% paste)) +
      geom_histogram(bins = 100) + theme_bw() + scale_fill_brewer("Cluster", palette = "Set1") + 
      ggtitle( paste0("Pyclone CCF ", inp$sample_id %>% unique()) ) + xlab("CCF") 
      
    # Assamble and save the plots
    p12 <- p1 / p2
    
    p12 %>%  ggplot2::ggsave(filename = "pyclone_plot.pdf", units = "px", device = "pdf", width = 2400, height = 3600)
    
    setwd("..")
    
    return(p12)
 
  
}

Here we use the package easypar to implement parallel execution. For the sake of compiling the vignette we are just running the first 32 samples.

easypar::run(lapply(names[1:32] , list), FUN = run_pyclone, filter_errors = F, export = NULL,parallel = T,
             cores.ratio = 0.6)
plots <- easypar::run(lapply(names[1:32] , list), FUN = plot_pyclone, filter_errors = F, export = NULL,parallel = T,
             cores.ratio = 0.6)

We then assemble the results in a data.frame

read_pyclone_BB_res <- function(x) 
{
  readr::read_tsv(file.path(x, "pyclone_output.tsv"))
}
all_pyclone_BB <-  easypar::run(lapply(names, list),FUN =  read_pyclone_BB_res, export = NULL, filter_errors = F )

saveRDS(all_pyclone_BB, file = "./pyclone_BB_all.rds")

Plots

require(ggpubr)

plots